{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Twitter Queries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import requests\n",
    "from requests_oauthlib import OAuth1\n",
    "\n",
    "# Search query: tweets mentioning the Premier League, excluding retweets and replies.\n",
    "q = 'premier league -filter:retweets AND -filter:replies'\n",
    "\n",
    "url = 'https://api.Twitter.com/1.1/search/tweets.json'  # v1.1 search endpoint\n",
    "\n",
    "# Request parameters: up to 100 English tweets per request, most recent results.\n",
    "pms = {'q': q, 'count': 100, 'lang': 'en', 'result_type': 'recent'}\n",
    "\n",
    "# Credentials are read from the environment so that secrets never end up in the\n",
    "# saved notebook; each one falls back to an empty string when the variable is unset.\n",
    "consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')\n",
    "consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')\n",
    "access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')\n",
    "access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')\n",
    "\n",
    "auth = OAuth1(consumer_key, consumer_secret, access_token, access_token_secret)\n",
    "\n",
    "res = requests.get(url, params=pms, auth=auth)\n",
    "# Fail here on an HTTP error instead of with a KeyError on 'statuses' in a later cell.\n",
    "res.raise_for_status()\n",
    "\n",
    "tweets = res.json()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Show every status (tweet) object contained in the search response.\n",
    "tweets['statuses']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Text of the first status in the search response.\n",
    "tweets['statuses'][0]['text']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pagination"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Page backwards through the search results (at most `number_of_pages` requests)\n",
    "# and store every page of tweets in MongoDB.\n",
    "\n",
    "from pymongo import MongoClient\n",
    "\n",
    "client = MongoClient('localhost:27017')\n",
    "db = client.twitter\n",
    "collection = db.tweets\n",
    "\n",
    "pages_counter = 0\n",
    "number_of_pages = 100\n",
    "\n",
    "while pages_counter < number_of_pages:\n",
    "    pages_counter += 1\n",
    "    res = requests.get(url, params=pms, auth=auth)\n",
    "    print(\"Connection status: %s\" % res.reason)\n",
    "    if not res.ok:\n",
    "        # An error response is not a page of tweets; stop instead of failing on 'statuses'.\n",
    "        break\n",
    "    tweets = res.json()\n",
    "    statuses = tweets.get('statuses', [])\n",
    "    if not statuses:\n",
    "        # Nothing older is left; min() and insert_many() both reject empty input.\n",
    "        break\n",
    "    # Store the page before moving the cursor, so a failed insert does not skip it.\n",
    "    collection.insert_many(statuses)\n",
    "    # Next request: only tweets older than the oldest one just stored. The -1 is\n",
    "    # needed because max_id would otherwise include (and duplicate) that tweet.\n",
    "    pms['max_id'] = min(status['id'] for status in statuses) - 1\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Stream API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "from pymongo import MongoClient\n",
    "from pymongo.errors import PyMongoError\n",
    "from requests_oauthlib import OAuth1\n",
    "\n",
    "client = MongoClient('mongodb://localhost:27017/')\n",
    "db = client['test']\n",
    "collection = db['test']\n",
    "\n",
    "url = 'https://stream.Twitter.com/1.1/statuses/filter.json'\n",
    "auth = OAuth1(consumer_key, consumer_secret, access_token, access_token_secret)\n",
    "\n",
    "# NOTE(review): this reuses the search query syntax; confirm that the streaming\n",
    "# `track` parameter understands the `-filter:` operators before relying on it.\n",
    "pms = {'track' : 'premier league -filter:retweets AND -filter:replies', 'lang': 'en'}\n",
    "\n",
    "res = requests.post(url, auth=auth, params=pms, stream=True)\n",
    "res.raise_for_status()  # do not try to parse an error response as a tweet stream\n",
    "\n",
    "for line in res.iter_lines():\n",
    "    if not line:\n",
    "        continue  # skip blank lines\n",
    "    tweet = json.loads(line)\n",
    "    try:\n",
    "        collection.insert_one(tweet)\n",
    "    except PyMongoError as exc:\n",
    "        # Best effort: report the failed insert and keep consuming the stream.\n",
    "        print(\"Could not store tweet: %s\" % exc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Pull"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "from pymongo import MongoClient\n",
    "\n",
    "client = MongoClient('mongodb://localhost:27017/')\n",
    "db = client['twitter']\n",
    "collection = db['tweets']\n",
    "\n",
    "# Load every stored document into a DataFrame (one row per document).\n",
    "documents = list(collection.find())\n",
    "df = pd.DataFrame(documents)\n",
    "\n",
    "# `source` is parsed as HTML and reduced to its visible text. The parser is named\n",
    "# explicitly so the result does not depend on which optional parsers are installed.\n",
    "df['tweet_source'] = df['source'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())\n",
    "\n",
    "# Keep tweets whose source label starts with 'Twitter', except 'Twitter Ads'.\n",
    "devices = set(df[df['tweet_source'].str.startswith('Twitter')]['tweet_source'])\n",
    "devices.discard('Twitter Ads')  # no error when that label is not present\n",
    "devices = list(devices)\n",
    "\n",
    "df = df[df['tweet_source'].isin(devices)]\n",
    "\n",
    "# Drop tweets whose text matches any of the excluded keywords below.\n",
    "df = df[~df['text'].str.contains(\"Ghana|ghana|jamaica|Jamaica|Ladbrokes|India|Pakistan|Ghana Premier League|Vijay|Predictions|Egyptian Premier League|cricket|Kings|Caribbean Premier League|@cricbuzz|Cricinfo\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Source label of every tweet still present in df.\n",
    "df.tweet_source"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import TweetTokenizer\n",
    "from nltk.corpus import stopwords\n",
    "import string\n",
    "\n",
    "# Sets give constant-time membership tests inside the per-token filters below.\n",
    "stopwords_vocabulary = set(stopwords.words('english'))\n",
    "punctuations = set(string.punctuation)\n",
    "digit_chars = set(string.digits)\n",
    "\n",
    "# Each column holds the previous column's tokens after one more filtering step.\n",
    "df['tokens'] = df['text'].apply(TweetTokenizer().tokenize)\n",
    "\n",
    "# 'stopwords': tokens with English stop words removed (compared in lower case).\n",
    "df['stopwords'] = df['tokens'].apply(lambda x: [i for i in x if i.lower() not in stopwords_vocabulary])\n",
    "\n",
    "# 'punctuation': additionally without tokens that are a single punctuation character.\n",
    "df['punctuation'] = df['stopwords'].apply(lambda x: [i for i in x if i not in punctuations])\n",
    "\n",
    "# 'digits': additionally without tokens whose first character is a digit.\n",
    "df['digits'] = df['punctuation'].apply(lambda x: [i for i in x if i[0] not in digit_chars])\n",
    "\n",
    "# 'final': additionally without one-character tokens.\n",
    "df['final'] = df['digits'].apply(lambda x: [i for i in x if len(i) > 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEICAYAAABI7RO5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGnhJREFUeJzt3Xm0XXV99/H3RxAnkDCkPAhIqNKqtdViFKlUrbTOBWpx\nRImWVcqqpdahilbA1qr4WGuhWhGLNVYqIo9WVCoigxUVNDgwGJEURZIyhDmAKMHv88f+XTnG5Obs\nJOeec3Pfr7XOunv/9j57f+/Zyfnc/dtTqgpJkoZ1n3EXIEmaXQwOSVIvBockqReDQ5LUi8EhSerF\n4JAk9WJwaM5KckKSo8Zdx0xL8pYkH93IZczJz04dg0MTJcm+Sb6a5NYkNyX5SpLHb4LlvjzJ+YNt\nVXV4Vb11Y5e9AbUM/cWd5LwkNye536jr6mNcn50mg8GhiZHkwcBngX8Gtgd2Af4W+Mk46xqXJAuA\n3wUK2H+sxUgDDA5Nkl8DqKqPVdU9VfXjqvpCVV08NUOSP0mytP0VfmaS3QemVZLDk1yR5JYk70vn\nkcAJwD5Jbk9yS5v/w0n+vg0/NcnyJK9Pcn2Sa5IcmOTZSb7f9n7eNLCu+yQ5Msn/JLkxyalJtm/T\nFrRaFiX5UZIbkvxNm/ZM4E3AC1st35nm8zgEuAD4MLBocEKr/X1JPpdkVZILkzxsYPpxSa5OcluS\ni5L87tpW0N5/xBptFyf5o/bZvad9HrcluSTJo9fy2e2Y5LPtM78pyZeT+N2yGXPjapJ8H7gnyeIk\nz0qy3eDEJAfQfek+D5gPfBn42BrLeC7weOC3gBcAz6iqpcDhwNeqauuqmreO9f8f4P50ezpHAx8E\nXgo8ju4v/6OS7NHmPQI4EHgK8BDgZuB9ayxvX+DXgf2Ao5M8sqo+D7wd+Hir5THTfB6HACe31zOS\n7LTG9BfR7ZFtBywD3jYw7RvAY+n23P4D+ESS+69lHYvb7whAkse03/9zwNOBJ9MF+rZ0n+eNa1nG\na4HldNtkJ7pt5L2MNmMGhyZGVd1G92VbdF/aK5OcPvCFeTjwjqpaWlWr6b6AHzu41wEcW1W3VNWP\ngHPpvjyHdTfwtqq6GzgF2BE4rqpWVdVlwHeBqS/6w4G/qarlVfUT4C3AQUm2HFje37a9pu8A3xl4\n73ol2RfYHTi1qi4C/gd4yRqzfaqqvt4+i5MHf9eq+mhV3VhVq6vq3cD96EJsTacDv5Zkzzb+MrpQ\n+2n7PLYBHgGkfe7XrGUZdwM7A7tX1d1V9eXyJnibNYNDE6V9Ob28qnYFHk331/w/tcm7A8e1LpFb\ngJuA0P2FPOXageE7ga17rP7GqrqnDf+4/bxuYPqPB5a3O/CpgVqWAvfQ/cW9KWpZBHyhqm5o4//B\nGt1V0y0/yetal96trb5t6YLwF1TVXcDHgZe27qUXA//epp0DvJduT+r6JCe241BrehfdHs8XklyZ\n5Mgev6dmIYNDE6uqvkfXv//o1nQ18GdVNW/g9YCq+uowi9vE5V0NPGuNWu5fVSs2tpYkD6DrFnpK\nkmuTXAu8GnhM60qaVjue8fq2jO1a19ytdCG7NouBg+m61O6sqq/9vNCq46vqccCj6Lqs/vqXfplu\nj+y1VfWrdAfxX5Nkv/XVqdnL4NDESPKIJK9Nsmsb343uL+AL2iwnAG9M8htt+rZJnj/k4q8Ddk2y\n1SYq9wTgbVPdZEnmt2Mww9ayYJoDyAfS7b08iq776bHAI+mO6RwyxPK3AVYDK4EtkxwNrG1PAYAW\nFD8D3k3b2wBI8vgkeye5L3AHcFeb7xckeW6ShycJXUDds7b5tPkwODRJVgF7AxcmuYMuMC6lO/hK\nVX0KeCdwSpLb2rRnDbnsc4DLgGuT3LC+mYdwHN3xgS8kWdVq3XvI936i/bwxyTfXMn0R8G9V9aOq\nunbqRddtdPAax1HW5kzg83QnG1xF94
V/9Xre8xHgN4HB60seTHes6ea2nBvpuqXWtCfwReB24GvA\nv1TVuetZn2axeAxLUpJDgMOqat9x16LJ5x6HNMcleSDw58CJ465Fs4PBIc1hSZ5BdyzkOrozt6T1\nsqtKktSLexySpF7Wd3bGrLTjjjvWggULxl2GJM0qF1100Q1VNX99822WwbFgwQKWLFky7jIkaVZJ\nctUw89lVJUnqxeCQJPVicEiSejE4JEm9GBySpF4MDklSLwaHJKkXg0OS1IvBIUnqZbO8clxz0x/+\n8/njLmGz8ZkjfCyH1s09DklSLyMLjiQfSnJ9kksH2rZPclaSK9rP7Vp7khyfZFmSi5PsNfCeRW3+\nK5IsGlW9kqThjHKP48PAM9doOxI4u6r2BM5u49A9N3rP9joMeD90QQMcQ/cs5ycAx0yFjSRpPEYW\nHFX138BNazQfACxuw4uBAwfaP1KdC4B5SXYGngGcVVU3VdXNwFn8chhJkmbQTB/j2KmqrmnD1wI7\nteFdgKsH5lve2tbV/kuSHJZkSZIlK1eu3LRVS5J+bmwHx6t7Zu0me25tVZ1YVQurauH8+et9Dokk\naQPNdHBc17qgaD+vb+0rgN0G5tu1ta2rXZI0JjMdHKcDU2dGLQI+PdB+SDu76onAra1L60zg6Um2\nawfFn97aJEljMrILAJN8DHgqsGOS5XRnRx0LnJrkUOAq4AVt9jOAZwPLgDuBVwBU1U1J3gp8o833\nd1W15gF3SdIMGllwVNWL1zFpv7XMW8Ar17GcDwEf2oSlrZdXIG86XoEsbX68clyS1IvBIUnqxeCQ\nJPVicEiSejE4JEm9GBySpF4MDklSLwaHJKkXg0OS1IvBIUnqxeCQJPVicEiSejE4JEm9GBySpF4M\nDklSLwaHJKkXg0OS1IvBIUnqxeCQJPVicEiSejE4JEm9GBySpF4MDklSLwaHJKkXg0OS1IvBIUnq\nxeCQJPVicEiSejE4JEm9GBySpF4MDklSL2MJjiSvTnJZkkuTfCzJ/ZPskeTCJMuSfDzJVm3e+7Xx\nZW36gnHULEnqzHhwJNkF+EtgYVU9GtgCeBHwTuA9VfVw4Gbg0PaWQ4GbW/t72nySpDEZV1fVlsAD\nkmwJPBC4BngacFqbvhg4sA0f0MZp0/dLkhmsVZI0YMaDo6pWAP8A/IguMG4FLgJuqarVbbblwC5t\neBfg6vbe1W3+HdZcbpLDkixJsmTlypWj/SUkaQ4bR1fVdnR7EXsADwEeBDxzY5dbVSdW1cKqWjh/\n/vyNXZwkaR3G0VX1+8APqmplVd0NfBJ4EjCvdV0B7AqsaMMrgN0A2vRtgRtntmRJ0pRxBMePgCcm\neWA7VrEf8F3gXOCgNs8i4NNt+PQ2Tpt+TlXVDNYrSRowjmMcF9Id5P4mcEmr4UTgDcBrkiyjO4Zx\nUnvLScAOrf01wJEzXbMk6V5brn+WTa+qjgGOWaP5SuAJa5n3LuD5M1GXJGn9vHJcktSLwSFJ6sXg\nkCT1YnBIknoxOCRJvaw3OJI8P8k2bfjNST6ZZK/RlyZJmkTD7HEcVVWrkuxLd9X3ScD7R1uWJGlS\nDRMc97SfzwFOrKrPAVuNriRJ0iQbJjhWJPkA8ELgjCT3G/J9kqTN0DAB8ALgTOAZVXULsD3w1yOt\nSpI0sYYJjg9U1Ser6gqAqroGeNloy5IkTaphguM3BkeSbAE8bjTlSJIm3TqDI8kbk6wCfivJbUlW\ntfHrufeW55KkOWadwVFV76iqbYB3VdWDq2qb9tqhqt44gzVKkibIMF1Vf5PkpUmOAkiyW5Jfuv25\nJGluGCY43gfsA7ykjd/e2iRJc9AwD3Lau6r2SvItgKq6OYkXAErSHDXMHsfd7UyqAkgyH/jZSKuS\nJE2sYYLjeOBTwE5J3gacD7x9pFVJkibWeruqqurkJBcB+wEBDqyqpSOvTJI0kYa959SOwJ1V9V7g\nhi
R7jLAmSdIEG+Z5HMcAbwCmrt24L/DRURYlSZpcw+xx/BGwP3AHQFX9L7DNKIuSJE2uYYLjp1VV\n3HtW1YNGW5IkaZINExyntudxzEvyp8AXgQ+OtixJ0qQa5qyqf0jyB8BtwK8DR1fVWSOvTJI0kdYb\nHEkOBf67qnx4kyRpqFuOPBT4QJIFwEXAfwNfrqpvj7AuSdKEWu8xjqo6pqqeRvdApy/TPTb2olEX\nJkmaTMN0Vb0ZeBKwNfAt4HV0ASJJmoOG6ap6HrAa+BzwJeBrVfWTkVYlSZpYw3RV7QX8PvB14A+A\nS5KcP+rCJEmTaZhbjjwaOBhYBLwQWAGcszErTTIvyWlJvpdkaZJ9kmyf5KwkV7Sf27V5k+T4JMuS\nXJxkr41ZtyRp4wxzAeCxdLcYOR54ZFX9XlUdvZHrPQ74fFU9AngMsBQ4Eji7qvYEzm7jAM8C9myv\nw4D3b+S6JUkbYZjg+GJV/d+q+mpV3Q2Q5FUbusIk2wJPBk4CqKqfVtUtwAHA4jbbYuDANnwA8JHq\nXEB3BfvOG7p+SdLGGSY4DllL28s3Yp17ACuBf0vyrST/2u5/tVNVXdPmuRbYqQ3vAlw98P7lre0X\nJDksyZIkS1auXLkR5UmSprPOs6qSvBh4CbBHktMHJm0D3LSR69wLOKKqLkxyHPd2SwFQVZWk+iy0\nqk4ETgRYuHBhr/dKkoY33em4XwWuoXuI07sH2lcBF2/EOpcDy6vqwjZ+Gl1wXJdk56q6pnVFXd+m\nrwB2G3j/rq1NkjQG6wyOqroKuArYZ1OusKquTXJ1kl+vqsvpHkn73fZaRHcwfhHw6faW04G/SHIK\nsDdw60CXliRphg1zAeAoHAGcnGQr4ErgFXTHW05tN1W8CnhBm/cM4NnAMuDONq8kaUzGEhztBokL\n1zJpv7XMW8ArR16UJGko6zyrKsnZ7ec7Z64cSdKkm26PY+ckvwPs344vZHBiVX1zpJVJkibSdMFx\nNHAU3VlM/7jGtAKeNqqiJEmTa7qzqk4DTktyVFW9dQZrkiRNsGGeOf7WJPvT3SYE4Lyq+uxoy5Ik\nTaph7o77DuBV3HutxauSvH3UhUmSJtMwp+M+B3hsVf0MIMliuicBvmmUhUmSJtMwNzkEmDcwvO0o\nCpEkzQ7D7HG8A/hWknPpTsl9MmvclFCSNHcMc3D8Y0nOAx7fmt5QVdeOtCpJ0sQa6pYj7aaCp693\nRknSZm/YYxySJAEGhySpp2mDI8kWSb43U8VIkibftMFRVfcAlyd56AzVI0macMMcHN8OuCzJ14E7\nphqrav+RVSVJmljDBMdRI69CkjRrDHMdx5eS7A7sWVVfTPJAYIvRlyZJmkTD3OTwT4HTgA+0pl2A\n/xxlUZKkyTXM6bivBJ4E3AZQVVcAvzLKoiRJk2uY4PhJVf10aiTJlnRPAJQkzUHDBMeXkrwJeECS\nPwA+AXxmtGVJkibVMMFxJLASuAT4M+AM4M2jLEqSNLmGOavqZ+3hTRfSdVFdXlV2VUnSHLXe4Ejy\nHOAE4H/onsexR5I/q6r/GnVxkqTJM8wFgO8Gfq+qlgEkeRjwOcDgkKQ5aJhjHKumQqO5Elg1onok\nSRNunXscSZ7XBpckOQM4le4Yx/OBb8xAbZLmiD/85/PHXcJm4zNH7DvydUzXVfWHA8PXAU9pwyuB\nB4ysIknSRFtncFTVK2ayEEnS7DDMWVV7AEcACwbn97bqkjQ3DXNW1X8CJ9FdLf6zTbXiJFsAS4AV\nVfXcFlCnADsAFwEvq6qfJrkf8BHgccCNwAur6oebqg5JUj/DnFV1V1UdX1XnVtWXpl6bYN2vApYO\njL8TeE9VPRy4GTi0tR8K3Nza39PmkySNyTDBcVySY5Lsk2SvqdfGrDTJrsBzgH9t4wGeRnf7doDF\nwIFt+IA2Tpu+X5tfkjQGw3RV/SbwMrov9qmuqmrjG+qfgNcD27Tx
HYBbqmp1G19O99wP2s+rAapq\ndZJb2/w3DC4wyWHAYQAPfaiPSJekURkmOJ4P/OrgrdU3RpLnAtdX1UVJnroplglQVScCJwIsXLjQ\ne2lJ0ogMExyXAvOA6zfROp8E7J/k2cD9gQcDxwHzkmzZ9jp2BVa0+VcAuwHL27NAtqU7SC5JGoNh\njnHMA76X5Mwkp0+9NnSFVfXGqtq1qhYALwLOqaqDgXOBg9psi4BPt+HT2zht+jnenVeSxmeYPY5j\nRl5F5w3AKUn+HvgW3SnAtJ//nmQZcBNd2EiSxmSY53FsilNv17Xs84Dz2vCVwBPWMs9ddMdZJEkT\nYJgrx1dx7zPGtwLuC9xRVQ8eZWGSpMk0zB7H1CmzU9dbHAA8cZRFSZIm1zAHx3+uOv8JPGNE9UiS\nJtwwXVXPGxi9D7AQuGtkFUmSJtowZ1UNPpdjNfBDuu4qSdIcNMwxDp/LIUn6uekeHXv0NO+rqnrr\nCOqRJE246fY47lhL24PobnO+A2BwSNIcNN2jY989NZxkG7rnZ7yC7mFL717X+yRJm7dpj3Ek2R54\nDXAw3TMx9qqqm2eiMEnSZJruGMe7gOfR3ar8N6vq9hmrSpI0saa7APC1wEOANwP/m+S29lqV5LaZ\nKU+SNGmmO8bR66pySdLcYDhIknoxOCRJvRgckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBI\nknoxOCRJvRgckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknoxOCRJvcx4cCTZLcm5Sb6b\n5LIkr2rt2yc5K8kV7ed2rT1Jjk+yLMnFSfaa6ZolSfcaxx7HauC1VfUo4InAK5M8CjgSOLuq9gTO\nbuMAzwL2bK/DgPfPfMmSpCkzHhxVdU1VfbMNrwKWArsABwCL22yLgQPb8AHAR6pzATAvyc4zXLYk\nqRnrMY4kC4DfBi4Edqqqa9qka4Gd2vAuwNUDb1ve2tZc1mFJliRZsnLlypHVLElz3diCI8nWwP8D\n/qqqbhucVlUFVJ/lVdWJVbWwqhbOnz9/E1YqSRo0luBIcl+60Di5qj7Zmq+b6oJqP69v7SuA3Qbe\nvmtrkySNwTjOqgpwErC0qv5xYNLpwKI2vAj49ED7Ie3sqicCtw50aUmSZtiWY1jnk4CXAZck+XZr\nexNwLHBqkkOBq4AXtGlnAM8GlgF3Aq+Y2XIlSYNmPDiq6nwg65i831rmL+CVIy1KkjQ0rxyXJPVi\ncEiSejE4JEm9GBySpF4MDklSLwaHJKkXg0OS1IvBIUnqxeCQJPVicEiSejE4JEm9GBySpF4MDklS\nLwaHJKkXg0OS1IvBIUnqxeCQJPVicEiSejE4JEm9GBySpF4MDklSLwaHJKkXg0OS1IvBIUnqxeCQ\nJPVicEiSejE4JEm9GBySpF4MDklSLwaHJKkXg0OS1IvBIUnqZdYER5JnJrk8ybIkR467Hkmaq2ZF\ncCTZAngf8CzgUcCLkzxqvFVJ0tw0K4IDeAKwrKqurKqfAqcAB4y5Jkmak7YcdwFD2gW4emB8ObD3\n4AxJDgMOa6O3J7l8hmoblx2BG8ZdxPrkL8ddwUSa+G3ndlurid9usNHbbvdhZpotwbFeVXUicOK4\n65gpSZZU1cJx16H+3Hazk9vtXrOlq2oFsNvA+K6tTZI0w2ZLcHwD2DPJHkm2Al4EnD7mmiRpTpoV\nXVVVtTrJXwBnAlsAH6qqy8Zc1rjNmW65zZDbbnZyuzWpqnHXIEmaRWZLV5UkaUIYHJKkXgyOWSbJ\n4UkOacMvT/KQgWn/6hX1s0OSBUlesoHvvX1T16N+ksxL8ucD4w9Jcto4a5pJHuOYxZKcB7yuqpaM\nuxb1k+SpdNvuuWuZtmVVrZ7mvbdX1dajrE/TS7IA+GxVPXrMpYyFexwzqP2V+b0kJydZmuS0JA9M\nsl+SbyW5JMmHktyvzX9sku8m
uTjJP7S2tyR5XZKDgIXAyUm+neQBSc5LsrDtlbxrYL0vT/LeNvzS\nJF9v7/lAuw+YhtS24dIkH0xyWZIvtM/+YUk+n+SiJF9O8og2/4fbtpp6/9TewrHA77bt8Oq2jU5P\ncg5wdpKtk5yd5Jvt34W32OlhA7bTw5Jc0D7rv5/aTtNsh2OBh7Xt9662vkvbey5I8hsDtUz9v3xQ\n+//99fb/ffZu06ryNUMvYAFQwJPa+IeAN9PdTuXXWttHgL8CdgAu5969wnnt51vo/lIFOA9YOLD8\n8+jCZD7dvb2m2v8L2Bd4JPAZ4L6t/V+AQ8b9ucymV9uGq4HHtvFTgZcCZwN7tra9gXPa8IeBgwbe\nf3v7+VS6v1in2l9Odyud7dv4lsCD2/COwLKBfwu3j/tzmPTXBmynzwIvbsOHD2yntW6HtvxL11jf\npW341cDftuGdgcvb8NuBl7bhecD3gQeN+7PakJd7HDPv6qr6Shv+KLAf8IOq+n5rWww8GbgVuAs4\nKcnzgDuHXUFVrQSuTPLEJDsAjwC+0tb1OOAbSb7dxn91E/xOc80Pqurbbfgiui+N3wE+0T7XD9B9\nYfR1VlXd1IYDvD3JxcAX6e7XttNGVT339NlO+wCfaMP/MbCMDdkOpwJTe5kvAKaOfTwdOLKt+zzg\n/sBDe/9WE2BWXAC4mVnzoNItdHsXvzhTd9HjE+i+3A8C/gJ4Wo/1nEL3j/Z7wKeqqpIEWFxVb9yg\nyjXlJwPD99B9kdxSVY9dy7yraV3CSe4DbDXNcu8YGD6Ybs/xcVV1d5If0n3RaHh9ttO69N4OVbUi\nyY1Jfgt4Id0eDHQh9MdVNetvwOoex8x7aJJ92vBLgCXAgiQPb20vA76UZGtg26o6g27X9zFrWdYq\nYJt1rOdTdLeefzFdiEC3m35Qkl8BSLJ9kqHuhqlp3Qb8IMnzAdKZ2l4/pNvLA9gfuG8bnm7bAWwL\nXN++rH6PIe9aqmlNt50uAP64Db9o4D3r2g7r234fB15P93/44tZ2JnBE+wOOJL+9sb/QuBgcM+9y\n4JVJlgLbAe8BXkG3+3wJ8DPgBLp/lJ9tu8jnA69Zy7I+DJwwdXB8cEJV3QwsBXavqq+3tu/SHVP5\nQlvuWWxYl4p+2cHAoUm+A1zGvc+L+SDwlNa+D/fuVVwM3JPkO0levZblnQwsbP8mDqHbc9TGW9d2\n+ivgNe3/xcPpuophHduhqm4EvpLk0sETUQacRhdApw60vZXuD4eLk1zWxmclT8edQZnjp/BJkyrJ\nA4Efty7dF9EdKJ+9Zz2NmMc4JKnrTnxv60a6BfiTMdcz0dzjkCT14jEOSVIvBockqReDQ5LUi8Eh\nSerF4JAk9fL/Ac9eOflGbVJ2AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x11075a630>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "sentiment = SentimentIntensityAnalyzer()\n",
    "\n",
    "# Compound polarity score of every tweet, computed on the raw text column.\n",
    "df['sentiment'] = df.text.apply(lambda x: sentiment.polarity_scores(x)['compound'])\n",
    "\n",
    "# Count tweets by the sign of their compound score.\n",
    "pos = int((df.sentiment > 0).sum())\n",
    "neg = int((df.sentiment < 0).sum())\n",
    "neu = int((df.sentiment == 0).sum())\n",
    "\n",
    "y = [pos, neu, neg]  # bar heights, in the same order as the tick labels\n",
    "tick_labels = ['positive', 'neutral', 'negative']\n",
    "positions = range(len(y))\n",
    "\n",
    "plt.bar(positions, height=y, width=0.75, align='center', alpha=0.8)\n",
    "plt.xticks(positions, tick_labels)\n",
    "plt.ylabel('Number of tweets')\n",
    "plt.title(\"Sentiment Analysis\")\n",
    "\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Customized sentiment analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'tagged.pickle' has to exist already: the dataset must be tagged before this cell\n",
    "# runs. The 'final' and 'label' columns are read from it below.\n",
    "# NOTE(review): unpickling can execute code - only load a file you created yourself.\n",
    "dataset = pd.read_pickle('tagged.pickle')\n",
    "\n",
    "classes = ['pos', 'neu', 'neg']\n",
    "\n",
    "# Rows 0-79 are used for training, rows 80-95 for testing.\n",
    "train_rows, test_rows = slice(0, 80), slice(80, 96)\n",
    "\n",
    "train_labels = dataset['label'][train_rows]\n",
    "test_labels = dataset['label'][test_rows]\n",
    "\n",
    "# Join each row's tokens back into a single space-separated string.\n",
    "train_data = [' '.join(tokens) for tokens in dataset['final'][train_rows]]\n",
    "test_data = [' '.join(tokens) for tokens in dataset['final'][test_rows]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6875"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "# TF-IDF features; terms in fewer than 5 documents or in more than 80% of them are ignored.\n",
    "vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True)\n",
    "\n",
    "# The vectorizer is fitted on the training texts only, then applied to the test texts.\n",
    "train_vectors = vectorizer.fit_transform(train_data)\n",
    "test_vectors = vectorizer.transform(test_data)\n",
    "\n",
    "# Fit a multinomial Naive Bayes classifier and show its score on the test rows.\n",
    "nb = MultinomialNB()\n",
    "nb.fit(train_vectors, train_labels)\n",
    "nb.score(test_vectors, test_labels)"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "\n",
    "print(\"Naive Bayes\")\n",
    "print(classification_report(test_labels, nb.predict(test_vectors)))\n",
    "print(confusion_matrix(test_labels, nb.predict(test_vectors)))\n",
    "predicted = cross_val_predict(nb, train_vectors, train_labels, cv=10)\n",
    "print(\"Cross validation %s\" % accuracy_score(train_labels, predicted))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Named Entity Recognition (NER)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from nltk.tag import StanfordNERTagger\n",
    "from collections import Counter\n",
    "import numpy as np\n",
    "\n",
    "# NOTE(review): only the model path is given here, so the Stanford NER jar presumably\n",
    "# has to be discoverable some other way (e.g. CLASSPATH) - confirm for your setup.\n",
    "st = StanfordNERTagger('path_to_your_folder/english.all.3class.distsim.crf.ser.gz')\n",
    "\n",
    "# Whitespace-tokenize every tweet text in df; tweets that yield no tokens are left out.\n",
    "tokenized_tweets = [tokens for tokens in (text.split() for text in df['text']) if tokens]\n",
    "\n",
    "# Tag all tweets in one call and keep every (word, tag) pair whose tag is not 'O'.\n",
    "entities = []\n",
    "for lst_tags in st.tag_sents(tokenized_tweets):\n",
    "    for tup in lst_tags:\n",
    "        if tup[1] != 'O':\n",
    "            entities.append(tup)\n",
    "\n",
    "# One row per kept (word, tag) pair; the organisation count cell reads 'word' and 'ner'.\n",
    "df_entities = pd.DataFrame(entities, columns=['word', 'ner'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Rows of df_entities whose 'ner' tag contains ORGANIZATION.\n",
    "is_organization = df_entities['ner'].str.contains(\"ORGANIZATION\")\n",
    "organizations = df_entities[is_organization]\n",
    "\n",
    "# The ten most frequent words among those rows, with their counts.\n",
    "cnt = Counter(organizations['word'])\n",
    "cnt.most_common(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combining NER and sentiment analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Average sentiment of the rows whose tweet text mentions 'Liverpool'.\n",
    "# NOTE(review): assumes `dataset` has 'tweet' and 'sentiment' columns - confirm.\n",
    "mentions_liverpool = dataset['tweet'].str.contains('Liverpool')\n",
    "subset = dataset[mentions_liverpool]\n",
    "avg_sentiment = np.mean(subset['sentiment'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
